library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(readr)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.4-8, (SVN revision 845)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.4.2, released 2019/06/28
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ.4 runtime: Rel. 5.2.0, September 15th, 2018, [PJ_VERSION: 520]
## Path to PROJ.4 shared files: /Library/Frameworks/R.framework/Versions/3.6/Resources/library/rgdal/proj
## Linking to sp version: 1.3-2
library(sp)
library(tmap)
library(leaflet)
library(ggplot2)
library(ggthemes)
New York’s Inequality Meets COVID-19
Project Objective
- Visualize COVID-19 positive cases at zipcode level, and explore if there is any correlation with demographic features such as median income, educational attainment, and race.
- Select key zipcodes with the highest and lowest positive cases per 1,000 residents and explore if we can find interesting observations in terms of daily growth rate –> can this mean that certion regions are not practicing social distancing enough?
Data Visualizations
- 1 chloropleth map to show positive cases per 1,000 residents at zipcode level —> hover label to show education attainment, income (i.e., % of population below the poverty line), and race (% of Black residents)
- scatterplot to show corrleation betwen COVID-19 & income level (median income at zip code level), group it by borough
- line chart to see growth trajectory of zey areas
Loading Data
#corona data
summary <- read.csv("summary.csv")
boro <- read.csv("boro.csv")
daily <- read.csv("daily.csv")
case <- read.csv("case-hosp-death.csv")
test_zip <- read.csv("tests_zip.csv")
#Combined data set of COVID-19 cases and demographics data at zipcode level
#demographics data at zipcode level was attained through the uszipcodes API and datasets
#were combined in Python
zip_demo <- read.csv("zip_with_demo_v2.csv")
zip_demo <- zip_demo %>% rename("ZIPCODE" = "MODZCTA")
zip_demo2 <- zip_demo %>% mutate(CasesPer1000 = (Positive * 1000) / Total)
zip_median <- read.csv("final.csv")
zip_median <- zip_median %>% select("MODZCTA", "median_household_income")
zip_median <- zip_median %>% rename("ZIPCODE" = "MODZCTA")
zip_demo2 <- left_join(zip_demo2, zip_median, by = "ZIPCODE")
Choloropleth map to show positive caes by zipcode
nyc <- readOGR(dsn = "/Users/sunghwapark/Desktop/DataVisFinalProj/ZIP_CODE_040114", layer = "ZIP_CODE_040114")
## OGR data source with driver: ESRI Shapefile
## Source: "/Users/sunghwapark/Desktop/DataVisFinalProj/ZIP_CODE_040114", layer: "ZIP_CODE_040114"
## with 263 features
## It has 12 fields
#class(nyc)
#str(nyc, max.level=2)
nyc2 <- nyc
Static Map for Positive Cases
zip_demo2$ZIPCODE <- as.character(zip_demo2$ZIPCODE)
nyc2@data <- nyc2@data %>%
left_join(zip_demo2, by = "ZIPCODE")
## Warning: Column `ZIPCODE` joining factor and character vector, coercing
## into character vector
#popup = c("Zipcode"="ZIPCODE", "COVID-19 Case"= "Positive")
tm <- tm_shape(nyc2) + tm_fill("Positive", palette = "BuPu") + tm_borders(alpha=.5, col = "white")
tm

Static Map for Positive Cases per 1,000 Residents
tm2 <- tm_shape(nyc2) + tm_fill("CasesPer1000", palette = "Blues") + tm_borders(alpha=.5, col = "white")
tm2

Interactive Map of Positive Cases
tm_leaflet <- tmap_leaflet(tm)
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
m <- tm_leaflet %>% addTiles('http://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}.png')
content <- paste("Zipcode:",nyc2@data$ZIPCODE, "<br/>",
"COVID-19 CASES:",nyc2@data$Positive,"<br/>",
"Demographics:","<br/>",
round(nyc2@data$population_by_race.Black.Or.African.American*100, 1), "of residents are Black", "<br/>",
round(nyc2@data$household_income....25.000*100, 1), "of people live below the poverty line", "<br/>",
round(nyc2@data$educational_attainment_for_population_25_and_over.Less.Than.High.School.Diploma*100, 1), "of people have less than high school diploma")
m
tm_leaflet3 <- tmap_leaflet(tm)
tm_leaflet3$x$calls[[5]]$args[[7]] <- leaflet:::evalFormula(
~content,
data=nyc2
)
tm_leaflet3$x$calls[[5]]$args[[9]] = highlightOptions = highlightOptions(
color='#000000', weight = 3,
bringToFront = TRUE, sendToBack = TRUE)
tm_leaflet3 %>% addTiles('http://{s}.basemaps.cartocdn.com/light_all/{z}/{x}/{y}.png')
Scatterplot to visualize correlation between COVID19 and income
nycdata <- nyc2@data
nycdata2 <- nyc2@data
nycdata2 <- nycdata2 %>% mutate(population_by_race_nonWhite = (1 - population_by_race.White))
nycdata2$positive_per_100k <- as.numeric(as.character(nycdata2$positive_per_100k))
nycdata2$median_household_income <- as.numeric(as.character(nycdata2$median_household_income))
gg <- ggplot(nycdata2, aes(x = median_household_income, y = positive_per_100k, color =
-educational_attainment_for_population_25_and_over.High.School.Graduate)) + xlim(20000, 180000) + geom_point(size = 2, alpha = 1) + geom_smooth(aes(group = 1),
method = "lm",
se = FALSE,
color = "lightpink3") + theme_economist()
gg
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 76 rows containing non-finite values (stat_smooth).
## Warning: Removed 76 rows containing missing values (geom_point).
